package geppetto.cat.programs;


import geppetto.cat.corpus.BilingualCorpus;

import java.io.IOException;
import java.io.PrintStream;



/**
 * Command-line tool that dumps the training sentences of a
 * {@link BilingualCorpus} to a text file, one sentence pair per line.
 *
 * <p>Each output line has the form
 * {@code s1,s2,...,sn<TAB>f1,f2,...,fm<NEWLINE>} where the {@code s} values are
 * the integers returned by {@code getSourceSentence} and the {@code f} values
 * are the integers returned by {@code getForeignSentence} for the same
 * sentence number. An empty sentence produces an empty field.
 *
 * <p>The file is written to {@code <outputDir>/train-indexes-<size>}.
 */
public class ConvertCorpus {
	
	/**
	 * Entry point.
	 *
	 * @param args {@code args[0]} corpus description passed to
	 *             {@code BilingualCorpus.getCorpusFromFileDescription};
	 *             {@code args[1]} corpus size (integer);
	 *             {@code args[2]} maximum sentence size (integer);
	 *             {@code args[3]} existing directory to write the output file into
	 * @throws IOException if the corpus cannot be loaded, the output file cannot
	 *                     be created, or a write to it fails
	 * @throws NumberFormatException if {@code args[1]} or {@code args[2]} is not
	 *                               a valid integer
	 */
	public static void main(String[] args) throws IOException {
		if (args.length < 4) {
			System.err.println("Usage: ConvertCorpus <corpusDescription> <size> <maxSentenceSize> <outputDir>");
			System.exit(1);
		}
		String corpusDescription = args[0];
		int size = Integer.parseInt(args[1]); // 100k
		int maxSentenceSize = Integer.parseInt(args[2]); // 40
		String outputDir = args[3];
		System.out.println("Size " + size);
		System.out.println("Max Sentence size " + maxSentenceSize);
		
		BilingualCorpus corpus = BilingualCorpus.getCorpusFromFileDescription(corpusDescription, size, maxSentenceSize);
		String outputFile = outputDir+"/train-indexes-"+size;
		PrintStream out = new PrintStream(outputFile);
		try {
			// Reused across iterations so each line is assembled in memory and
			// handed to the stream in a single print call.
			StringBuilder line = new StringBuilder();
			for(int i = 0; i < corpus.getNumberOfTrainingSentences(); i++){
				int[] sourceSentence = corpus.getSourceSentence(i, BilingualCorpus.TRAIN_CORPUS);
				int[] foreignSentence = corpus.getForeignSentence(i, BilingualCorpus.TRAIN_CORPUS);
				line.setLength(0);
				appendCommaSeparated(line, sourceSentence);
				line.append('\t');
				appendCommaSeparated(line, foreignSentence);
				line.append('\n');
				out.print(line.toString());
			}
			// PrintStream never throws on write failures; it only records them,
			// so the error flag has to be checked explicitly.
			if (out.checkError()) {
				throw new IOException("Error writing to " + outputFile);
			}
		} finally {
			// Always release the file handle, even if the corpus access fails.
			out.close();
		}
	}
	
	/**
	 * Appends the given values to {@code target} separated by commas, with no
	 * trailing comma. Appends nothing when {@code values} is empty.
	 */
	private static void appendCommaSeparated(StringBuilder target, int[] values) {
		for(int k = 0; k < values.length; k++){
			if (k > 0) {
				target.append(',');
			}
			target.append(values[k]);
		}
	}
}
